********************************************************************************
* Compare RAND-ALP Bundles with CES
* Paper Figure 1.b 
********************************************************************************

clear
clear matrix
set more off
set scheme s1color
estimates clear
graph drop _all
set matsize 2500
log close _all
graph set window fontface "Times New Roman"

** Set Directory
cd "../Do"
** Set Haver Directory
// set haverdir "Haver", perm

********************************************************************************
** HAVER PULL DATA
********************************************************************************
** Import Data on nondurables spending
import haver cesxf@surveys cesxhu@surveys cesxp@surveys cesxtg@surveys cesxhkl@surveys ///
	cesxmm@surveys cesxmd@surveys cesxms@surveys cesxhop@surveys cesxhp@surveys cesxop@surveys ///
	cesxef@surveys cesxeo@surveys cesxhfp@surveys cesxv@surveys cesxmi@surveys
rename (cesxf cesxhu cesxp cesxtg cesxhkl cesxmm cesxmd cesxms cesxhop cesxhp cesxop cesxef cesxeo cesxhfp cesxv cesxmi) ///
	(food utilities_fuels apparel_services gas_oil laundry_clean med_serv med_drugs med_supplies ///
	household_oth household_ops pers_care entertainment_fees entertainment_oth pets_toys audio_visual health_insurance)

*a series of bundles from least inclusive to most inclusive; none of the bundles includes health insurance, as this is never in our total
*first bundle omits household operations, audio-visual equip and services, and other entertainment
gen ces_nondur_minus_hhops_av_othent=food + utilities_fuels + apparel_services + gas_oil + laundry_clean + med_serv + med_drugs + med_supplies ///
	+ household_oth + pers_care + entertainment_fees + pets_toys
rename ces_nondur_minus_hhops_av_othent ces_minimal
*next bundle omits household operations and audio-visual equip. and services
gen ces_nondur_minus_hhops_av=food + utilities_fuels + apparel_services + gas_oil + laundry_clean + med_serv + med_drugs + med_supplies ///
	+ household_oth + pers_care + entertainment_fees + entertainment_oth + pets_toys
rename ces_nondur_minus_hhops_av ces_intermediate
*next bundle omits just audio-visual equip. and services
gen ces_nondur_minus_AV = food + utilities_fuels + apparel_services + gas_oil + laundry_clean + med_serv + med_drugs + med_supplies ///
	+ household_oth + pers_care + entertainment_fees + entertainment_oth + household_ops + pets_toys 
*max bundle includes everything imported except health insurance
gen ces_nondur_maximal = food + utilities_fuels + apparel_services + gas_oil + laundry_clean + med_serv + med_drugs + med_supplies ///
	+ household_oth + pers_care + entertainment_fees + entertainment_oth + household_ops + pets_toys + audio_visual

tempfile haverd
save `haverd'

********************************************************************************
** LOAD AND CLEAN NONDURABLES DATA
********************************************************************************

use ../Data/matched_data_nondurables_jun2018_baseline.dta, clear

*list of spending categories that will add up to nondurables (first version excludes only sports, which is excluded because it has durable goods in it)
#delimit ;
local spendcats "
electricity water heatingfuel phonecable housecleaningproducts housecleaningservice
gardenproducts gardenservice clothing personalcare drugs healthcareservices medsupplies entertainment
hobbies personalservices otherchildspending foodhome foodout gasoline";
#delimit cr

*based on content of "spendcats" above, nondurables defined as all included in that group except sports
egen nondurables = rowtotal(`spendcats')

*not that nondurables has one outlier value over 29000 (for monthly spending on nondurable goods/services) 
*before running some regressions, extreme values for nondurables will be omitted; a small number of zeroes will be dropped

drop ethnicity

recode mort (5=0)
recode stocks (5=0)
recode retacct (5=0)
recode howner (5=0)

* Recode some expectations variables from the Inflation surveys. 

* unemployment dummies
recode q6 (1=1) (2 3 = 0), gen(unemp_increase)
recode q6 (3=1) (1 2 = 0), gen(unemp_decrease)

gen conditions_12m = q2a
* note everybody who says "other" is in separate category
recode q2a (1=1) (2 3 = 0), gen(conditions_12m_better)
recode q2a (2=1) (1 3 = 0), gen(conditions_12m_worse)

gen interestrate_12m = q7
recode interestrate_12m (1=1) (2 3 = 0), gen(intrate_12m_up)
recode interestrate_12m (3=1) (1 2 = 0), gen(intrate_12m_down)

gen bconditions_12m = q4
* note everybody who says "other" is in separate category
recode q4 (1=1) (2 3 = 0), gen(bconditions_12m_better)
recode q4 (2=1) (1 3 = 0), gen(bconditions_12m_worse)

* house price forecasts
gen hppoint = .
replace hppoint = 0 if q41==3
replace hppoint = q42 if q41==1
replace hppoint = -q42 if q41==2
* there are some extreme outliers (in the tens of thousands)
replace hppoint = . if abs(hppoint)>=200
la var hppoint "House price expectation"

* Deflation: using 2012q1 PCUN (nondurables CPI): generate a new variable so regressions can be run on nominal and real data alternatively
* from Arman: note that PCUN was redefined in the haver do file to give 2012q1 dollars

rename spend_month date_monthly
sort date_monthly
merge m:1 date_monthly using "../Data/haver_m.dta"

tab _merge
drop if _merge==2
drop _merge

*MB: my simple deflation command: just deflating my main measure of nondurables
gen nondurables_real=nondurables/PCUN
rename date_monthly spend_month
*note real values are only a little higher on average than nominal values

*** Prepare some additional descriptives
* Race isn't reported in all periods. 
preserve
collapse (mean) race , by(prim_key)
sort prim_key
tempfile race
save `race'
restore

drop race
sort prim_key
merge m:1 prim_key using `race'
tab _merge
drop _merge
drop if prim_key==""

recode race (1=1) (nonmissing = 0), gen(white)
gen nonwhite = 1-white
recode gender (2=1) (1=0), gen(female)
recode highesteducation (4 9 = 0) (10/16 = 1), gen(coll)

* q31s* gives codes. Kind of odd that they've created separate variables. 
*MB: odd to generate employed variable, because everyone in sample is employed--however some also say they are retired (?) 
gen employed = . 
replace employed=1 if q31s1==1
replace employed = 0 if (q31s2==2 | q31s3==3 | q31s4==4 | q31s5==5 | q31s6==6 | q31s7==7) & q31s1!=1

* generate currently retired variable.
*odd that some are retired even though all are employed; how do retired people have wage growth expectations? they may be working a small job anyway
drop retired 
gen retired = . 
replace retired = 1 if q31s5==5
replace retired = 0 if (q31s1==1 | q31s2==2 | q31s3==3 | q31s4==4 | q31s6==6 | q31s7==7) & q31s5!=5
tab retired

*replace howner=0 if howner==.
*adds ten people to homeowner status; there are still some with howner_fix==0 who have positive mortgage payment, but missing data for has mortgage and/or mortgage amount
*two observations have howner==0 and mort==1 (say they're not a homeowner but they say they have a mortgage); not important so not recoding them for now

*the mortgage dummy is non-missing for over 2000 observations; however the mortgage balance (amtmort) is observed only for 990 people
*tab mort
*reasonable recode of mortgage dummy based on other information
replace mort = 0 if (howner!=1 | mortgage==0) & mort==.
replace mort = 1 if (howner==1 | (mortgage>0 & mortgage!=.)) & mort==.
la var mort "Mortgage Indicator"

*MB: generating quasi-continuous income variable based on midpoint of ranges of annual household income variables (familyincome and familyincome_part2)
rename familyincome inc2
rename familyincome_part2 inc2_2
drop if inc2==.
*here is the income variable: "new_faminc"
gen new_faminc=.
replace new_faminc=2500 if inc2==1
replace new_faminc=6250 if inc2==2
replace new_faminc=8750 if inc2==3
replace new_faminc=11250 if inc2==4
replace new_faminc=13750 if inc2==5
replace new_faminc=17500 if inc2==6
replace new_faminc=22500 if inc2==7
replace new_faminc=27500 if inc2==8
replace new_faminc=32500 if inc2==9
replace new_faminc=37500 if inc2==10
replace new_faminc=45000 if inc2==11
replace new_faminc=55000 if inc2==12
replace new_faminc=67500 if inc2==13
replace new_faminc=87500 if inc2==14 & (inc2_2==1 | inc2_2==.)
*above accounts for one person with inc2==14 and inc2_2 missing; not sure why that's the case but I asigned them the lowest category of income over $75000
replace new_faminc=112500 if inc2==14 & inc2_2==2
replace new_faminc=162500 if inc2==14 & inc2_2==3
replace new_faminc=237500 if inc2==14 & inc2_2==4

gen log_new_faminc=log(new_faminc)

sort prim_key spend_month
*bringing in SAMPLE WEIGHTS, to monthly data: warning, we might lose observations if the set requiring weights has changed ; this will affect our ability to run regs using non-employed types (can only do unweighted) 
merge m:1 prim_key using ../Data/mweights_pooled
*these are the reg weights, the full sample weights will still be called weight_full
*weight variable is called "weight_samp" to indicate the weights were designed for the regression sample
*195 observations dropped that didn't merge with a weight_samp
drop if _merge!=3
drop _merge

*define regression sample before recentering any variables
*drop people with extreme values for spending, inflation expectations, mortgage payment that looks like a mortgage balance
*outliers identified by Ali: turn on as alternative MB: the below prim_key doesn't look suspicious to me---this drop may have been related to the previous (bad) income variable
*drop if prim_key=="5041140:1"
*drops 
drop if mortgage>200000 & mortgage!=.
*drops 17 observations
drop if d_inflmedian>35
*drops 13 observations
drop if nondurables==0
*drops 1 observation
drop if nondurables>28000

*assigning locals for weights (can turn on or off in regression)
local weights "[pweight=weight_samp]"
local weights_full "[pweight=weight_full]"
local pwfile "_pw"
*drop those with missing values for regressors
la var d_inflmedian "Inflation Expectation"
la var d_infliqr "Inflation Uncertainty"
la var d_longinflmedian "Inflation Expectation"
la var d_longinfliqr "Inflation Uncertainty"
la var lag_IE "Lagged Inflation Expectation"
la var lag_infl_iqr "Lagged Infl. Uncertainty"

*variable for sum of monthly payments--interactions between this variable and IE may be included in some models
gen payments=mortgage+car if howner==1
replace payments=rent+car if howner!=1
gen log_payments=.
replace log_payments=log(payments) if payments>0
replace log_payments=0 if payments==0
la var log_payments "Monthly Payments (Log)"
*drop one observation with extreme value for payments (110,000): only if running a regression interacting with payments
drop if payments>100000

*defining sample; removing those with missing values
drop if weight_samp==.
drop if d_inflmedian==.
drop if d_infliqr==.
drop if nondurables==.
*dropping lagged IE: only need if we include lag IE in regression 
drop if lag_IE==.
drop if lag_infl_iqr==.
*new income variable: new_faminc is recode of categorical variables familyincome and familyincome_part2; former variable was earnings last month and highly unreliable
drop if new_faminc==.
drop if intrate_12m_up==.
drop if intrate_12m_down==.
drop if unemp_increase==.
drop if unemp_decrease==.
drop if rw_expect==.
drop if d_wageiqr==.
drop if rage==.
drop if nonwhite==.
drop if female==.
drop if coll==.
drop if retired==.
drop if mort==.
*alternative: comment out the last two drops and omit howner and hppoint from regressions 
*below results in loss of 300+ observations--will retain alternative and omit hppoint from the regression
drop if hppoint==.
*this drops 111 observations--can run alternative model without howner dummy and include these 
drop if howner==.

destring prim_key, generate(id_new) ignore(":")

*generate variable that equals 1 in all cases, to sum to determine observations per person
gen pre_obs=1
*generate sum of observations per person
egen obs=total(pre_obs), by(id_new)
sum obs, d
*define sample  based on sufficient observation
gen nondurables_sample_base=(obs>=3)

*generate variable that equals 1 in all cases, to sum to determine observations per person
gen pre_obs2=1
*generate sum of observations per person
egen obs2=total(pre_obs2), by(id_new)
sum obs2, d
***Important line here: define sample  based on sufficient observations
gen nondurables_sample_hp=(obs2>=3)

gen exp_year = year(exp_date)
gen exp_month_2 = month(exp_date)
gen exp_yearmo = ym(exp_year, exp_month)
format exp_yearmo %tm
rename exp_date expectations_date

tempfile nondur 
save `nondur'

********************************************************************************
* CREATE FIGURE
********************************************************************************

use `nondur', clear
collapse (mean) nondurables if nondurables_sample_hp==1 [pweight=weight_samp], by(spend_month)
gen year = yofd(dofm(spend_month))
tsset spend_month
*below is taking ratio of current monthly nondurables to its 12-month lag; we will impute missing values for 2011 and 2012 assuming that there is a constant ratio between any pair of matched months across the years,
*based on the average ratio of spending in a given month in 2012 to spending for the same month in 2011. 
*this generates the ratio for each month (ratio of current to its 12-month lag)
gen ratio_to_12mo_lag_spend=nondurables/l12.nondurables
*below gives the within-year average of the ratios of current to 12-month lagged monthly spending--to be used to extrapolate missing values
by year, sort: egen ave_ratio=mean(ratio_to_12mo_lag_spend)
*fill in missing months in the time series
tsfill, full
*drop the year variable and recreate in order to fill in values for newly included months (formerly missing months) 
drop year
gen year=yofd(dofm(spend_month))
**filling in missing values for average ratio with appropriate value (always points to a value in the same calendar year in our data's cases)
replace ave_ratio=l1.ave_ratio if ave_ratio==.
*impute nondurables spending for November 2012 as 0.85 of nondurables spending from 2011--this is the average relationship of 2012 spend to 2011 spend in the 10 months of overlapping observations
replace nondurables=ave_ratio*l12.nondurables if year==2012 & nondurables==. 
*impute nondurables spending for July 2011 as 1/0.85 time nondurables spending from July 2012--same comment as above (average relationship of 2011 spend to 2012 spend in 10 months of overlapping observations) 
replace nondurables=f12.nondurables/f12.ave_ratio if year==2011 & nondurables==.
*regenerates ratio of current month spending to 12-month lagged spending, just to make sure everything checks out and imputed months take the average ratio (based on how they were constructed) 
gen new_ratio_to_12mo_lag_spend=nondurables/l12.nondurables
*nethod 1: next calculate share of months 4 and 5 in annual spending for 2011 and 2012, same for months 11 and 12; then infer total spending for 2009 and 2010 based on weighting up the sum of existing months
by year, sort: egen annual_spend=sum(nondurables)
gen month_share=nondurables/annual_spend
gen month_simple=month(dofm(spend_month))
*reset time variable to month_simple (if we can do that)
tsset spend_month
sort spend_month
*share of combined april and may spending out of annual spending, by year (only populates for 2011 and 2012)
gen share_april_may=month_share+f1.month_share if month_simple==4
*now average the april-may share between 2011 and 2012 and populate it in all observations
egen scale_april_may=mean(share_april_may)
*now do same for november and december
sort spend_month
*have to exclude 2009 because share of nov and dec out of annual spending equals 1; have to exclude 2010 because it's also not a complete year and share will be too large
gen share_nov_dec=month_share+f1.month_share if month_simple==11 & year>=2011
egen scale_nov_dec=mean(share_nov_dec)
*replace existing value for annual_spend for 2009 with scaled-up value: divide existing value by the share of Nov&Dec in the total (based on years with complete data)
replace annual_spend=annual_spend/scale_nov_dec if year==2009
*replace existing value for annual_spend for 2010 with scaled-up value: divide existing value by complement of April-May share of the total (1-april-may-share)
replace annual_spend=annual_spend/(1-scale_april_may) if year==2010
*now plot the annual_spend values against different CES bundles
rename annual_spend nondurables_annual

collapse (mean) nondurables_annual, by(year)

tsset year
tempfile nondur_year 
save `nondur_year'

use `haverd', clear
rename time year
merge 1:1 year using `nondur_year', gen(havermatch)

sort year
keep if havermatch==3

gen ratio_alp_ces=nondurables_annual/ces_nondur_minus_AV
egen ave_ratio_alp_ces=mean(ratio_alp_ces)

la var ces_intermediate "Nondurable Goods Spending, CES Intermediate Bundle"
*intermediate bundle includes all imported CES categories from Haver (from above) except AV equipment and services, household operations, and health insurance
la var ces_nondur_minus_AV "Nondurables Goods Spending, CES Bundle Minus AV"
*the "minus_AV" bundle includes all imported CES categories from Haver (from above) except AV equipment and services, and health insurance
*see table notes below for complete list of items in each bundle
la var ces_nondur_maximal "Nondurable Goods Spending, CES Maximal Bundle" 
*the "maximal" bundle includes all imported CES categories from Haver (from above) except health insurance

la var nondurables_annual "Nondurable Goods Spending, Regression Sample" 
	
tw scatter nondurables_annual ces_nondur_minus_AV year, ///
	/*title("Nondurable Goods Spending:" "Regression Sample vs. Consumer Expenditure Survey", size(medsmall))*/ ///
	ylab(12000(2000)22000, ang(h) labsize(medlarge)) xlab(, labsize(medlarge)) ytitle("Expenditure ({c S|})", size(medlarge)) ///
	legend(rows(2) size(medlarge) order(1 "Nondurable Goods Spending, RAND-ALP" 2 "Nondurable Goods Spending, CES" ) region(lstyle(none))) ///
	msize(large large) xtitle("")
	graph save "../Figures/1b.gph", replace
graph export ../Figures/figure_1b.png, as(png) replace

graph combine "../Figures/1a" "../Figures/1b", name("fig1", replace) cols(2) iscale(1.2) ysize(2.1) 
graph export "../Figures/figure_1.png", as(png) replace

set scheme s1mono

tw scatter nondurables_annual ces_nondur_minus_AV year, ///
	/*title("Nondurable Goods Spending:" "Regression Sample vs. Consumer Expenditure Survey", size(medsmall))*/ ///
	ylab(12000(2000)22000, ang(h) labsize(medlarge)) xlab(, labsize(medlarge)) ytitle("Expenditure ({c S|})", size(medlarge)) ///
	legend(rows(2) size(medlarge) order(1 "Nondurable Goods Spending, RAND-ALP" 2 "Nondurable Goods Spending, CES" ) region(lstyle(none))) ///
	msize(large large) xtitle("")
	graph save "../Figures/1b_bw.gph", replace
graph export ../Figures/figure_1b_bw.png, as(png) replace

graph combine "../Figures/1a_bw" "../Figures/1b_bw", name("fig1_bw", replace) cols(2) iscale(1.2) ysize(2.1) 
graph export "../Figures/figure_1_bw.png", as(png) replace


/*
	
tw scatter nondurables_annual ces_intermediate year, ///
	title("Nondurable Goods Spending:" "Regression Sample vs. Consumer Expenditure Survey Intermediate Bundle", size(medsmall)) ylab(12000(2000)22000, ang(h)) ///
	legend(rows(2) size(small) order(1 "Nondurable Goods Spending, Regression Sample" 2 "Nondurable Goods Spending, Consumer Expenditure Survey Int. Bundle" )) ///
	msize(large large)

graph export ces_nondurables_comparison_intermediate_bundle_MAB.png, as(png) replace

* go with the minus_AV version

*try with the maximal bundle just for kicks

tw scatter nondurables_annual ces_nondur_maximal year, ///
	title("Nondurable Goods Spending:" "Regression Sample vs. Consumer Expenditure Survey Maximal Bundle", size(medsmall)) ylab(12000(2000)22000, ang(h)) ///
	legend(rows(2) size(small) order(1 "Nondurable Goods Spending, Regression Sample" 2 "Nondurable Goods Spending, Consumer Expenditure Survey Maximal Bundle" )) ///
	msize(large large)

graph export ces_nondurables_comparison_maximal_bundle_MAB.png, as(png) replace

*now one with both the minus-AV and maximal against ours

tw scatter nondurables_annual ces_nondur_maximal ces_nondur_minus_AV year, ///
	title("Nondurable Goods Spending:" "Regression Sample vs. Consumer Expenditure Survey Bundles", size(medsmall)) ylab(12000(2000)22000, ang(h)) ///
	legend(rows(3) size(small) order(1 "Nondurable Goods Spending, Regression Sample" 2 "Nondurable Goods Spending, Consumer Expenditure Survey Maximal Bundle" 3 "Consumer Expenditure Survey Bundle Minus AV" )) ///
	msize(large large large)

graph export ces_nondurables_comparison_dual_MAB.png, as(png) replace

	
MAB


*/
